package com.formulasearchengine.mathosphere.mlp.text;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.guava.GuavaModule;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import com.formulasearchengine.mathosphere.mlp.cli.CountCommandConfig;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.flink.api.java.tuple.Tuple2;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import static com.formulasearchengine.mathosphere.mlp.RelationExtractor.createPrinter;
/**
 * Counts tokens or identifiers found in a JSON dump and writes the counts as JSON or CSV.
 *
 * <p>The input layout enforced by {@code processFile} is a JSON array of objects. Every object
 * must start with an {@code inputhash} field, followed by exactly one field named
 * {@code tokens} or {@code texvcinfo}. When the value of that field is an array, each of its
 * elements is handed to an {@link Extractor}; when it is an object, it is skipped.
 *
 * <p>Created by Moritz on 07.10.2015.
 *
 * @TODO: Investigate https://github.com/FasterXML/jackson-databind
 */
public class TokenCounter {

  /**
   * Collects elements read from a JSON stream into a multiset.
   *
   * <p>Kept as a non-static inner class with a public field so that the existing public API
   * of this class stays unchanged.
   *
   * @param <T> type of the counted elements
   */
  public abstract class Extractor<T> {
    /** The elements collected so far, together with their occurrence counts. */
    public final Multiset<T> tokens;

    public Extractor() {
      tokens = HashMultiset.create();
    }

    /**
     * Reads one element starting at the parser's current token and adds it to {@link #tokens}.
     *
     * @param jsonParser parser positioned on the first token of the element
     * @throws IOException if the parser fails to read the input
     */
    public abstract void addFromJson(JsonParser jsonParser) throws IOException;
  }

  /** Extracts two-element entries of the form {@code [type, value]} as tuples. */
  public class TupleExtractor extends Extractor<Tuple2<String, String>> {
    /**
     * Reads one entry and counts it as a {@code (type, value)} tuple.
     *
     * @param jParser parser whose current token precedes the type token
     *     (the opening {@code [} of the entry when called from {@code processFile})
     * @throws IOException if the parser fails to read the input
     */
    @Override
    public void addFromJson(JsonParser jParser) throws IOException {
      jParser.nextToken(); // step from the opening [ onto the type
      String type = jParser.getText();
      jParser.nextToken(); // step onto the value
      String value = jParser.getText();
      jParser.nextToken(); // step onto the closing ]
      tokens.add(new Tuple2<>(type, value));
    }
  }

  /** Extracts plain identifiers: the text of the current token is counted as-is. */
  public class IdentifierExtractor extends Extractor<String> {
    /**
     * Counts the text of the parser's current token.
     *
     * @param jParser parser positioned on the identifier token
     * @throws IOException if the parser fails to read the input
     */
    @Override
    public void addFromJson(JsonParser jParser) throws IOException {
      tokens.add(jParser.getText());
    }
  }

  /**
   * Counts the {@code (type, value)} tuples contained in the given JSON stream.
   *
   * @param in JSON input in the layout described in the class documentation
   * @return multiset of all tuples that were read
   * @throws IOException if the input cannot be read or does not have the expected layout
   */
  public Multiset<Tuple2<String, String>> countTokens(InputStream in) throws IOException {
    TupleExtractor extractor = new TupleExtractor();
    processFile(in, extractor);
    return extractor.tokens;
  }

  /**
   * Counts the identifiers contained in the given JSON stream.
   *
   * <p>The method name (including its spelling) is kept for compatibility with existing
   * callers.
   *
   * @param in JSON input in the layout described in the class documentation
   * @return multiset of all identifiers that were read
   * @throws IOException if the input cannot be read or does not have the expected layout
   */
  public Multiset<String> countIdentifer(InputStream in) throws IOException {
    IdentifierExtractor extractor = new IdentifierExtractor();
    processFile(in, extractor);
    return extractor.tokens;
  }

  /**
   * Walks the top-level JSON array and feeds every entry to the given extractor.
   *
   * <p>The parser is closed when this method returns, whether normally or with an exception.
   *
   * @param in JSON input stream
   * @param tokens extractor receiving the elements of each {@code tokens}/{@code texvcinfo} array
   * @throws IOException if the input cannot be read or does not have the expected layout
   */
  private void processFile(InputStream in, Extractor<?> tokens) throws IOException {
    JsonFactory jfactory = new JsonFactory();
    // try-with-resources: the parser is released even when a layout error aborts parsing.
    try (JsonParser jParser = jfactory.createJsonParser(in)) {
      if (jParser.nextToken() != JsonToken.START_ARRAY) {
        emitError("Expected a JSON array Unexpected token " + jParser.getText());
      }
      while (jParser.nextToken() != JsonToken.END_ARRAY) {
        processEntry(jParser, tokens);
      }
    }
  }

  /**
   * Processes one object of the top-level array; the parser must be positioned on its first
   * token and is left on the object's closing token.
   */
  private void processEntry(JsonParser jParser, Extractor<?> tokens) throws IOException {
    if (jParser.getCurrentToken() != JsonToken.START_OBJECT) {
      emitError("Unexpected token " + jParser.getText());
    }

    // First field must be the input hash. Constant-first equals avoids a
    // NullPointerException when the parser reports no current field name.
    jParser.nextToken();
    if (!"inputhash".equals(jParser.getCurrentName())) {
      emitError("Missing inputhash " + jParser.getText());
    }
    jParser.nextToken();
    String hash = jParser.getText();

    // Second field must be "tokens" or "texvcinfo".
    jParser.nextToken();
    String fieldName = jParser.getCurrentName();
    if (!"tokens".equals(fieldName) && !"texvcinfo".equals(fieldName)) {
      emitError("Unexpected token" + jParser.getText());
    }

    // Advance exactly once onto the value and branch on that single token; advancing again
    // for the object check would lose the START_OBJECT token.
    JsonToken valueStart = jParser.nextToken();
    if (valueStart == JsonToken.START_ARRAY) {
      while (jParser.nextToken() != JsonToken.END_ARRAY) {
        tokens.addFromJson(jParser);
      }
    } else if (valueStart == JsonToken.START_OBJECT) {
      // Object values carry nothing to count; skip to the matching end, nested objects included.
      jParser.skipChildren();
    } else {
      emitError("[ after texvcinfo expected in " + hash + " but got " + jParser.getText());
    }

    if (jParser.nextToken() != JsonToken.END_OBJECT) {
      emitError("Missing object end");
    }
  }

  /**
   * Reports a layout error by throwing.
   *
   * @param message description of the problem
   * @throws IOException always, carrying {@code message}
   */
  private void emitError(String message) throws IOException {
    throw new IOException(message);
  }

  /**
   * Counts identifiers or tokens of the configured input file and writes the entries, highest
   * count first, to the printer created for the configuration.
   *
   * <p>Identifier counts are written as CSV (header {@code tex,count}) when CSV output is
   * configured, otherwise as JSON; token counts are always written as JSON. Any failure is
   * reported via {@code printStackTrace()} and not propagated.
   *
   * @param config command configuration providing input, output and mode
   */
  public static void run(CountCommandConfig config) {
    try {
      // NOTE(review): the printer is only flushed, not closed here, because this file does not
      // show what createPrinter wraps (it may be standard output) - confirm before closing it.
      PrintWriter pw = createPrinter(config);
      TokenCounter tokenCounter = new TokenCounter();
      ObjectMapper mapper = new ObjectMapper().registerModule(new GuavaModule());
      // The input file is closed even when counting or writing fails.
      try (InputStream in = new FileInputStream(config.getInput())) {
        if (config.isIdentifiers()) {
          ImmutableSet<Multiset.Entry<String>> entries =
              Multisets.copyHighestCountFirst(tokenCounter.countIdentifer(in)).entrySet();
          if (config.isCsv()) {
            CSVPrinter printer =
                CSVFormat.DEFAULT.withHeader("tex", "count").withRecordSeparator("\n").print(pw);
            for (Multiset.Entry<String> entry : entries) {
              String[] output = {entry.getElement(), String.valueOf(entry.getCount())};
              printer.printRecord(output);
            }
          } else {
            mapper.writeValue(pw, entries);
          }
        } else {
          ImmutableSet<Multiset.Entry<Tuple2<String, String>>> entries =
              Multisets.copyHighestCountFirst(tokenCounter.countTokens(in)).entrySet();
          mapper.writeValue(pw, entries);
        }
      }
      pw.flush();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}